In [64]:
# Consolidated imports for the whole notebook.
# (Fixed: the original imported matplotlib.pyplot three times and numpy twice.)
import warnings
warnings.filterwarnings('ignore')

import itertools

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# visualization libraries
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.style as style
from matplotlib import colors
from matplotlib.colors import ListedColormap
from mpl_toolkits.mplot3d import Axes3D
from IPython.display import Image
import plotly.express as px
import plotly.graph_objs as go
import plotly.figure_factory as ff
import plotly.offline as pyo
from plotly import tools
from plotly.subplots import make_subplots
import seaborn as sns

import missingno as msno  # to visualize missing data

# resampling for class imbalance
from imblearn.over_sampling import SMOTE

# scikit-learn: preprocessing, model selection, metrics, models
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (accuracy_score, confusion_matrix, precision_score,
                             recall_score, roc_auc_score, f1_score,
                             plot_confusion_matrix, plot_roc_curve, roc_curve)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# gradient-boosting libraries
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
In [2]:
# IPython "automagic": echoes the kernel's current working directory (same as %pwd).
pwd
Out[2]:
'/Users/juliet'

For this project, we will analyze a credit card approval dataset. Financial institutions receive many credit card applications daily, and many factors contribute to an applicant being approved or denied. Reviewing these factors by hand is time-consuming, but the task of analyzing credit card approval factors can be done easily with the power of machine learning. We will build a model that predicts whether an applicant is a 'good' or 'bad' applicant.

In [3]:
# Monthly credit-status records: one row per (ID, MONTHS_BALANCE).
# NOTE(review): hardcoded absolute local path — breaks on any other machine;
# consider a configurable DATA_DIR constant instead.
CC = pd.read_csv("/Users/juliet/Dropbox/My Mac (Juliet’s MacBook Air)/Downloads/credit_record.csv")
In [4]:
# Preview the first rows of the credit-record table.
CC.head()
Out[4]:
ID MONTHS_BALANCE STATUS
0 5001711 0 X
1 5001711 -1 0
2 5001711 -2 0
3 5001711 -3 0
4 5001712 0 C
In [5]:
# Applicant demographic / financial attributes: one row per application.
# NOTE(review): hardcoded absolute local path — same portability issue as above.
AR = pd.read_csv("/Users/juliet/Dropbox/My Mac (Juliet’s MacBook Air)/Downloads/application_record.csv")
In [6]:
# Preview the first rows of the application-record table.
AR.head()
Out[6]:
ID CODE_GENDER FLAG_OWN_CAR FLAG_OWN_REALTY CNT_CHILDREN AMT_INCOME_TOTAL NAME_INCOME_TYPE NAME_EDUCATION_TYPE NAME_FAMILY_STATUS NAME_HOUSING_TYPE DAYS_BIRTH DAYS_EMPLOYED FLAG_MOBIL FLAG_WORK_PHONE FLAG_PHONE FLAG_EMAIL OCCUPATION_TYPE CNT_FAM_MEMBERS
0 5008804 M Y Y 0 427500.0 Working Higher education Civil marriage Rented apartment -12005 -4542 1 1 0 0 NaN 2.0
1 5008805 M Y Y 0 427500.0 Working Higher education Civil marriage Rented apartment -12005 -4542 1 1 0 0 NaN 2.0
2 5008806 M Y Y 0 112500.0 Working Secondary / secondary special Married House / apartment -21474 -1134 1 0 0 0 Security staff 2.0
3 5008808 F N Y 0 270000.0 Commercial associate Secondary / secondary special Single / not married House / apartment -19110 -3051 1 0 1 1 Sales staff 1.0
4 5008809 F N Y 0 270000.0 Commercial associate Secondary / secondary special Single / not married House / apartment -19110 -3051 1 0 1 1 Sales staff 1.0

The number of clients and the number of rows are not equal, which means the data contains duplicates.

Analyze the Missing Values

In [7]:
# Nullity matrix for the application records: each white gap is a missing value.
missing1 = msno.matrix(AR)
missing1.set_title("Missing data for Application Records dataset", fontsize=30)
Out[7]:
Text(0.5, 1.0, 'Missing data for Application Records dataset')

Occupation type is missing 30% of its values.

In [8]:
# Nullity matrix for the credit records — confirms this table has no gaps.
missing2 = msno.matrix(CC)
missing2.set_title("Missing data for Credit Records dataset", fontsize=30)
Out[8]:
Text(0.5, 1.0, 'Missing data for Credit Records dataset')

Analyze Unique Counts

In [9]:
# Distinct-value count per column of the application records, ascending.
counts1 = (
    pd.DataFrame(
        [(name, AR[name].nunique()) for name in AR.columns],
        columns=['Column_Name', 'Num_Unique'],
    )
    .sort_values(by=['Num_Unique'])
)
In [10]:
# Distinct-value count per column of the credit records, ascending.
counts2 = (
    pd.DataFrame(
        [(name, CC[name].nunique()) for name in CC.columns],
        columns=['Column_Name', 'Num_Unique'],
    )
    .sort_values(by=['Num_Unique'])
)
counts2
Out[10]:
Column_Name Num_Unique
2 STATUS 8
1 MONTHS_BALANCE 61
0 ID 45985

Data Visualization

In [11]:
# Global seaborn theme: thin grid lines, compact titles, automatic layout.
context_overrides = {
    "grid.linewidth": 0.1,
    'patch.linewidth': 0.0,
    "axes.grid": True,
    "grid.linestyle": "-",
    "axes.titlesize": 13,
    "figure.autolayout": True,
}
sns.set_context("notebook", font_scale=.7, rc=context_overrides)

palette1 = ['#FF5E5B','#EC9B9A','#00CECB','#80DE99','#C0E680','#FFED66']
sns.set_palette(sns.color_palette(palette1))
In [12]:
# Distributions of the main numeric application features.
# Fixed: the original created a stray empty 10x10 figure with plt.figure()
# that DataFrame.hist() ignored (it builds its own figure) — visible in the
# output as "<Figure size 720x720 with 0 Axes>". Size the hist grid directly.
plot1 = ["CNT_CHILDREN","AMT_INCOME_TOTAL","DAYS_BIRTH","DAYS_EMPLOYED"]
axes = AR[plot1].hist(edgecolor='black', linewidth=1.2, figsize=(12, 6))

We can see that there are outliers in children count and amount of total income.

In [13]:
# Side-by-side customer counts by income type and by family status.
fig, (ax_income, ax_family) = plt.subplots(1, 2)

income_plot = sns.countplot(y=AR.NAME_INCOME_TYPE, linewidth=1.2, ax=ax_income)
income_plot.set_title("Customer Distribution by Income Type")
income_plot.set_xlabel("Count")

family_plot = sns.countplot(y=AR.NAME_FAMILY_STATUS, linewidth=1.2, ax=ax_family)
family_plot.set_title("Customer Distribution by Family Status")
family_plot.set_xlabel("Count")

fig.set_size_inches(14, 5)
plt.tight_layout()
plt.show()
# Side-by-side customer counts by housing type and by education level.
# (Reformatted: this cell's source had been collapsed onto a single line.)
fig, axes = plt.subplots(1, 2)

plot2 = sns.countplot(y=AR.NAME_HOUSING_TYPE, linewidth=1.2, ax=axes[0])
plot2.set_title("Customer Distribution by Housing Type")
plot2.set_xlabel("Count")
plot2.set_ylabel("Housing Type")

plot3 = sns.countplot(y=AR.NAME_EDUCATION_TYPE, ax=axes[1])
plot3.set_title("Customer Distribution by Education")
plot3.set_xlabel("Count")
plot3.set_ylabel("Education Type")

fig.set_size_inches(14, 5)
plt.tight_layout()
plt.show()
In [14]:
# Pie charts: gender split, car ownership, and realty ownership.
fig, axes = plt.subplots(1, 3)

shared_style = dict(explode=[0.1, 0.1], autopct='%1.1f%%', shadow=True,
                    textprops={'fontsize': 12})

plot4 = AR['CODE_GENDER'].value_counts().plot.pie(
    colors=["#76B5B3", "#EC9B9A"], ax=axes[0], **shared_style)
plot4.set_title("Customer Distribution by Gender")

plot5 = AR['FLAG_OWN_CAR'].value_counts().plot.pie(
    colors=["#80DE99", "#00CECB"], ax=axes[1], **shared_style)
plot5.set_title("Car Ownership")

plot6 = AR['FLAG_OWN_REALTY'].value_counts().plot.pie(
    colors=["#76B5B3", "#00CECB"], ax=axes[2], **shared_style)
plot6.set_title("Realty Ownership")

fig.set_size_inches(14, 5)
plt.tight_layout()
plt.show()

Now, we will clean and prepare the raw data to enable feature engineering.

In [15]:
# Keep only the most recent application per client (ID has duplicate rows),
# then drop OCCUPATION_TYPE entirely — ~30% of its values are missing.
# Fixed: the original comment claimed it "dropped the missing values"; it
# actually removes the whole column. Also use consistent assignment style
# instead of mixing `=` with inplace=True.
AR = AR.drop_duplicates('ID', keep='last')
AR = AR.drop(columns='OCCUPATION_TYPE')
In [16]:
# Unique-value counts for the remaining non-numeric (object) columns.
column1 = AR.columns[AR.dtypes == 'object'].tolist()

counts2 = (
    pd.DataFrame(
        [(name, AR[name].nunique()) for name in column1],
        columns=['Column_Name', 'Num_Unique'],
    )
    .sort_values(by=['Num_Unique'])
)

counts2  # unique counts for object columns
Out[16]:
Column_Name Num_Unique
0 CODE_GENDER 2
1 FLAG_OWN_CAR 2
2 FLAG_OWN_REALTY 2
3 NAME_INCOME_TYPE 5
4 NAME_EDUCATION_TYPE 5
5 NAME_FAMILY_STATUS 5
6 NAME_HOUSING_TYPE 6

Here, we can see the columns that have non numeric values. We will convert them to numeric columns if needed.

In [17]:
# Give the application columns short, readable names.
readable_names = {
    "CODE_GENDER": "Gender",
    "FLAG_OWN_CAR": "Own_Car",
    "FLAG_OWN_REALTY": "Own_Realty",
    "CNT_CHILDREN": "Children_Count",
    "AMT_INCOME_TOTAL": "Income",
    "NAME_INCOME_TYPE": "Income_Type",
    "NAME_EDUCATION_TYPE": "Education",
    "NAME_FAMILY_STATUS": "Family_Status",
    "NAME_HOUSING_TYPE": "Housing_Type",
    "DAYS_BIRTH": "Birthday",
    "DAYS_EMPLOYED": "Employment_Date",
    "FLAG_MOBIL": "Own_Mobile",
    "FLAG_WORK_PHONE": "Own_Work_Phone",
    "FLAG_PHONE": "Own_Phone",
    "FLAG_EMAIL": "Own_Email",
    "CNT_FAM_MEMBERS": "Family_Member_Count",
}
AR = AR.rename(columns=readable_names)
In [18]:
# Earliest (most negative) MONTHS_BALANCE per client = month the account opened.
open1=pd.DataFrame(CC.groupby(["ID"])["MONTHS_BALANCE"].agg(min))
open1=open1.rename(columns={'MONTHS_BALANCE':'begin_month'}) 
indiv=pd.merge(AR,open1,how="left",on="ID") #merge to record data

#convert categoric features into numeric

# Binary encodings: F/M -> 0/1, Y/N -> 1/0.
indiv["Gender"] =  indiv['Gender'].replace(['F','M'],[0,1])
indiv["Own_Car"] = indiv["Own_Car"].replace(["Y","N"],[1,0])
indiv["Own_Realty"] = indiv["Own_Realty"].replace(["Y","N"],[1,0])
# Working / commercial / state servant -> 1; pensioners and students -> 0.
indiv["Is_Working"] = indiv["Income_Type"].replace(["Working","Commercial associate","State servant","Pensioner","Student"],[1,1,1,0,0])

# Married or civil marriage -> 1; single / separated / widowed -> 0.
indiv["In_Relationship"] = indiv["Family_Status"].replace(["Civil marriage","Married","Single / not married",
                                                                          "Separated","Widow"],[1,1,0,0,0])

# Collapse the six housing categories to two: "With parents" vs. everything else.
housing = {'House / apartment' : 'House / apartment',
                   'With parents': 'With parents',
                    'Municipal apartment' : 'House / apartment',
                    'Rented apartment': 'House / apartment',
                    'Office apartment': 'House / apartment',
                    'Co-op apartment': 'House / apartment'}

indiv["Housing_Type"] = indiv['Housing_Type'].map(housing)

# Collapse the five family statuses to Married vs. Single.
household = {'Single / not married':'Single',
                     'Separated':'Single',
                     'Widow':'Single',
                     'Civil marriage':'Married',
                    'Married':'Married'}

indiv["Family_Status"] = indiv["Family_Status"].map(household)

# Collapse the five education levels to three broad tiers.
education = {'Secondary / secondary special':'secondary',
                     'Lower secondary':'secondary',
                     'Higher education':'Higher education',
                     'Incomplete higher':'Higher education',
                     'Academic degree':'Academic degree'}


indiv["Education"] = indiv["Education"].map(education)

# Collapse income sources to Working / Pensioner / Student.
income = {'Commercial associate':'Working',
                  'State servant':'Working',
                  'Working':'Working',
                  'Pensioner':'Pensioner',
                  'Student':'Student'}
indiv["Income_Type"] = indiv["Income_Type"].map(income)

# Household size = children + 2 adults if partnered, else 1 adult.
indiv["Household_Size"] = indiv["Children_Count"] + indiv["In_Relationship"].apply(lambda x: 2 if x==1 else 1)

# DAYS_BIRTH counts days backwards from the application date; convert to years.
indiv["Age"] = round((indiv.Birthday/365)*-1)

# Years employed: negative day counts become positive years; non-negative
# values are mapped to 0 (presumably the dataset's not-working sentinel —
# TODO confirm against the data dictionary).
indiv["Experience"] = indiv.Employment_Date/365
indiv['Experience']=indiv['Experience'].apply(lambda v : int(v*-1) if v <0 else 0)

# Drop the raw columns that are now encoded elsewhere.
indiv=indiv.drop(columns=['Employment_Date','Birthday','Children_Count'])

# One-hot encode the remaining categorical columns.
indiv= pd.get_dummies(indiv, columns=['Income_Type', 'Education','Family_Status',"Housing_Type"])
In [19]:
# Preview the engineered per-client feature table.
indiv.head()
Out[19]:
ID Gender Own_Car Own_Realty Income Own_Mobile Own_Work_Phone Own_Phone Own_Email Family_Member_Count ... Income_Type_Pensioner Income_Type_Student Income_Type_Working Education_Academic degree Education_Higher education Education_secondary Family_Status_Married Family_Status_Single Housing_Type_House / apartment Housing_Type_With parents
0 5008804 1 1 1 427500.0 1 1 0 0 2.0 ... 0 0 1 0 1 0 1 0 1 0
1 5008805 1 1 1 427500.0 1 1 0 0 2.0 ... 0 0 1 0 1 0 1 0 1 0
2 5008806 1 1 1 112500.0 1 0 0 0 2.0 ... 0 0 1 0 0 1 1 0 1 0
3 5008808 0 0 1 270000.0 1 0 1 1 1.0 ... 0 0 1 0 0 1 0 1 1 0
4 5008809 0 0 1 270000.0 1 0 1 1 1.0 ... 0 0 1 0 0 1 0 1 1 0

5 rows × 26 columns

In [20]:
# Box plots of the engineered numeric features, to eyeball outliers before
# removal. (Fixed: removed the unused `column2` list the original defined.)
fig = make_subplots(rows=2, cols=2, start_cell="bottom-left",
                    subplot_titles=("Income", "Age", "Experience", "Family Member Count"))

fig.add_trace(go.Box(x=indiv.Income, name='Income', boxmean=True), row=1, col=1)
fig.add_trace(go.Box(x=indiv.Age, name='Age', boxmean=True), row=1, col=2)
fig.add_trace(go.Box(x=indiv.Experience, name='Experience', boxmean=True), row=2, col=1)
fig.add_trace(go.Box(x=indiv.Household_Size, name="Family Member Count", boxmean=True), row=2, col=2)

fig.show()

The box plots above display the outliers in the children count, family member count, income, and employment columns. In order to create an accurate model, we need to remove them by using z-scores.

In [21]:
def calculate_z_scores(df, cols):
    """Return a copy of `df` with a `<col>_z_score` column added per column in `cols`.

    The z-score is (value - column mean) / sample standard deviation.

    Parameters
    ----------
    df : pd.DataFrame
        Source frame; left unmodified (the original version wrote the new
        columns straight into the caller's frame — a hidden side effect).
    cols : list of str
        Numeric columns to score.

    Returns
    -------
    pd.DataFrame
        Copy of `df` with the extra z-score columns appended.
    """
    out = df.copy()
    for col in cols:
        out[col + "_z_score"] = (out[col] - out[col].mean()) / out[col].std()
    return out

# Score the three outlier-prone features, then keep only rows within
# 3.5 standard deviations on all of them.
deg = calculate_z_scores(df = indiv, cols = ["Income","Experience","Household_Size"])

Z_LIMIT = 3.5
within_limits = (
    (deg.Household_Size_z_score.abs() <= Z_LIMIT)
    & (deg.Experience_z_score.abs() <= Z_LIMIT)
    & (deg.Income_z_score.abs() <= Z_LIMIT)
)

# .copy() + plain drop replaces the original's inplace drop on a filtered
# slice, which raised SettingWithCopyWarning (hidden by filterwarnings).
app1 = deg[within_limits].copy()
app1 = app1.drop(columns=["Income_z_score", "Experience_z_score", "Household_Size_z_score"])
In [22]:
# Same box plots after outlier removal, to confirm the trim worked.
# (Fixed: removed the unused `column3` list the original defined.)
fig = make_subplots(rows=2, cols=2, start_cell="bottom-left",
                    subplot_titles=("Income", "Age", "Experience", "Family Member Count"))

fig.add_trace(go.Box(x=app1.Income, name='Income', boxmean=True), row=1, col=1)
fig.add_trace(go.Box(x=app1.Age, name='Age', boxmean=True), row=1, col=2)
fig.add_trace(go.Box(x=app1.Experience, name='Experience', boxmean=True), row=2, col=1)
fig.add_trace(go.Box(x=app1.Household_Size, name="Family Member Count", boxmean=True), row=2, col=2)

fig.show()
In [23]:
# Flag clients who were ever 60+ days past due (STATUS '2'..'5') in any month.
# Fixed: the original used chained-indexing assignment
# (CC['dep_value'][mask] = ...), which depends on SettingWithCopy behavior;
# .loc with isin() is the supported, equivalent form.
CC['dep_value'] = None
CC.loc[CC['STATUS'].isin(['2', '3', '4', '5']), 'dep_value'] = 'Yes'

# groupby().count() counts non-null dep_value per client: > 0 means at least
# one serious delinquency anywhere in the history.
check = CC.groupby('ID').count()
check.loc[check['dep_value'] > 0, 'dep_value'] = 'Yes'
check.loc[check['dep_value'] == 0, 'dep_value'] = 'No'
check = check[['dep_value']]
In [24]:
# Data to analyze length of time since initial approval of credit card
# Shows number of past dues, paid off and no loan status.
# Per-client credit history: one column per month (status string), plus
# account age and a count of each status over the observation window.
group = CC.groupby('ID')

pivot1 = CC.pivot(index='ID', columns='MONTHS_BALANCE', values='STATUS')
pivot1['open_month'] = group['MONTHS_BALANCE'].min()
pivot1['end_month'] = group['MONTHS_BALANCE'].max()
pivot1['window'] = pivot1['end_month'] - pivot1['open_month'] + 1  # months start at 0

# The first 61 columns are the month statuses (-60 .. 0). Counting equality
# matches directly replaces the original's convoluted
# pivot1[pivot1.iloc[:, 0:61] == s].count(axis=1) masking trick — same result.
month_status = pivot1.iloc[:, 0:61]
pivot1['paid_off'] = (month_status == 'C').sum(axis=1)
pivot1['pastdue_1-29'] = (month_status == '0').sum(axis=1)
pivot1['pastdue_30-59'] = (month_status == '1').sum(axis=1)
pivot1['pastdue_60-89'] = (month_status == '2').sum(axis=1)
pivot1['pastdue_90-119'] = (month_status == '3').sum(axis=1)
pivot1['pastdue_120-149'] = (month_status == '4').sum(axis=1)
pivot1['pastdue_over_150'] = (month_status == '5').sum(axis=1)
pivot1['no_loan'] = (month_status == 'X').sum(axis=1)

# Materialize the index as a column so it can be merged with the app data.
pivot1['ID'] = pivot1.index
In [25]:
# Preview the per-client history table (month columns + status counts).
pivot1.head()
Out[25]:
MONTHS_BALANCE -60 -59 -58 -57 -56 -55 -54 -53 -52 -51 ... window paid_off pastdue_1-29 pastdue_30-59 pastdue_60-89 pastdue_90-119 pastdue_120-149 pastdue_over_150 no_loan ID
ID
5001711 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 4 0 3 0 0 0 0 0 1 5001711
5001712 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 19 9 10 0 0 0 0 0 0 5001712
5001713 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 22 0 0 0 0 0 0 0 22 5001713
5001714 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN ... 15 0 0 0 0 0 0 0 15 5001714
5001715 NaN X X X X X X X X X ... 60 0 0 0 0 0 0 0 60 5001715

5 rows × 73 columns

In [26]:
# Collapse the per-status counts into three summary columns, then join them
# and the delinquency flag onto the cleaned application data.
pastdue_cols = ['pastdue_1-29', 'pastdue_30-59', 'pastdue_60-89',
                'pastdue_90-119', 'pastdue_120-149', 'pastdue_over_150']

history_counts = pd.DataFrame({
    'ID': pivot1.index,
    'paid_off': pivot1['paid_off'].values,
    '#_of_pastdues': pivot1[pastdue_cols].sum(axis=1).values,
    'no_loan': pivot1['no_loan'].values,
})

app2 = app1.merge(history_counts, how='inner', on='ID')

app3 = pd.merge(app2, check, how='inner', on='ID')
app3['target'] = app3['dep_value']
app3.loc[app3['target'] == 'Yes', 'target'] = 1
app3.loc[app3['target'] == 'No', 'target'] = 0

app3 = app3.drop(columns=["dep_value"])
In [27]:
# Correlation heatmap (lower triangle only). Own_Mobile is excluded; reset
# matplotlib defaults first so the seaborn context set earlier doesn't apply.
matplotlib.rcParams.update(matplotlib.rcParamsDefault)

f, ax = plt.subplots(figsize=(15, 15))
diverging_cmap = sns.diverging_palette(230, 20, as_cmap=True)
corr = app3.drop(columns=["Own_Mobile"]).corr().round(1)
upper_triangle = np.triu(np.ones_like(corr, dtype=bool))
sns.heatmap(corr, annot=True, mask=upper_triangle, cmap=diverging_cmap)
Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fbb3333e970>
In [32]:
# Class balance: the target is heavily skewed toward non-delinquent clients.
target_ax = app3['target'].value_counts().plot.pie(
    explode=[0.1, 0.1],
    autopct='%1.1f%%',
    shadow=True,
    colors=['#FF5E5B', '#C0E680'],
    textprops={'fontsize': 7},
)
target_ax.set_title("Target distribution")

plt.show()
In [33]:
# Re-apply the seaborn theme (the heatmap cell reset matplotlib's rcParams),
# this time with a larger default figure size.
context_overrides = {
    "grid.linewidth": 0.1,
    'patch.linewidth': 0.0,
    "axes.grid": True,
    "grid.linestyle": "-",
    "axes.titlesize": 13,
    'figure.figsize': (15, 15),
}
sns.set_context("notebook", font_scale=.7, rc=context_overrides)

palette2 = ['#FF5E5B','#EC9B9A','#00CECB','#80DE99','#C0E680','#FFED66']
sns.set_palette(sns.color_palette(palette2))
In [37]:
# Boxen plots of income, age, and work experience split by target class.
fig, axes = plt.subplots(1, 3)

target_palette = ['#FF5E5B', '#C0E680']
panels = [("Income", "Income-Target"),
          ("Age", "Age-Target"),
          ("Experience", "Work Experience-Target")]

for ax, (col, title) in zip(axes, panels):
    panel = sns.boxenplot(x='target', y=col, data=app3, palette=target_palette, ax=ax)
    panel.set_title(title)

fig.set_size_inches(14, 5)
plt.tight_layout()
In [39]:
# KDE distributions by employment flag, faceted on the target class.
kde_opts = dict(hue="Is_Working", col='target', kind="kde", height=4,
                facet_kws={'sharey': False, 'sharex': False},
                palette=['#C70039', '#80DE99'])

sns.displot(data=app3, x='Income', **kde_opts)
sns.displot(data=app3, x='Age', **kde_opts)
sns.displot(data=app3, x='Experience', **kde_opts)
sns.displot(data=app3, x='begin_month', **kde_opts)
Out[39]:
<seaborn.axisgrid.FacetGrid at 0x7fbb1e398f40>
In [40]:
# KDE distributions of the credit-history counts, same faceting as above.
history_kde_opts = dict(hue="Is_Working", col='target', kind="kde", height=4,
                        facet_kws={'sharey': False, 'sharex': False},
                        palette=['#C70039', '#80DE99'])

sns.displot(data=app3, x='no_loan', **history_kde_opts)
sns.displot(data=app3, x='#_of_pastdues', **history_kde_opts)
sns.displot(data=app3, x='paid_off', **history_kde_opts)
Out[40]:
<seaborn.axisgrid.FacetGrid at 0x7fbb0e930910>
In [41]:
# Preview the fully assembled modeling table (features + target).
app3.head()
Out[41]:
ID Gender Own_Car Own_Realty Income Own_Mobile Own_Work_Phone Own_Phone Own_Email Family_Member_Count ... Education_Higher education Education_secondary Family_Status_Married Family_Status_Single Housing_Type_House / apartment Housing_Type_With parents paid_off #_of_pastdues no_loan target
0 5008804 1 1 1 427500.0 1 1 0 0 2.0 ... 1 0 1 0 1 0 13 2 1 0
1 5008805 1 1 1 427500.0 1 1 0 0 2.0 ... 1 0 1 0 1 0 12 2 1 0
2 5008806 1 1 1 112500.0 1 0 0 0 2.0 ... 0 1 1 0 1 0 7 7 16 0
3 5008808 0 0 1 270000.0 1 0 1 1 1.0 ... 0 1 0 1 1 0 0 2 3 0
4 5008809 0 0 1 270000.0 1 0 1 1 1.0 ... 0 1 0 1 1 0 0 0 5 0

5 rows × 30 columns

Feature Selection: Feature Selection is a feature engineering component that involves the removal of irrelevant features and picks the best set of features to train a robust machine learning model.

Calculating the Weight of Evidence (WoE): the weight of evidence tells the predictive power of an independent variable in relation to the dependent variable.

In [42]:
def calc_iv(df, feature, target, pr=False):
    """Compute the Information Value (IV) of `feature` against binary `target`.

    For each distinct feature value the function tallies total / good
    (target == 0) / bad (target == 1) counts, derives the Weight of Evidence
    WoE = ln(dist_good / dist_bad), and sums WoE * (dist_good - dist_bad).

    Parameters
    ----------
    df : pd.DataFrame
        Source data. Not modified (the original wrote its fillna result
        back into the caller's frame — a hidden side effect).
    feature : str
        Column whose predictive power is measured; NaNs treated as "NULL".
    target : str
        Binary target column (0 = good, 1 = bad).
    pr : bool, optional
        If True, print the per-value table and the total IV.

    Returns
    -------
    (float, pd.DataFrame)
        Total IV and the per-value breakdown table.
    """
    # Work on local copies of the two columns instead of mutating `df`.
    feat = df[feature].fillna("NULL")
    tgt = df[target]

    # Iterate the unique values once; the original re-built
    # list(df[feature].unique()) on every loop iteration.
    lst = []
    for val in feat.unique():
        in_group = feat == val
        total = int(in_group.sum())
        good = int((in_group & (tgt == 0)).sum())
        bad = int((in_group & (tgt == 1)).sum())
        lst.append([feature, val, total, good, bad])

    data = pd.DataFrame(lst, columns=['Variable', 'Value', 'All', 'Good', 'Bad'])

    data['Share'] = data['All'] / data['All'].sum()
    data['Bad Rate'] = data['Bad'] / data['All']
    data['Distribution Good'] = (data['All'] - data['Bad']) / (data['All'].sum() - data['Bad'].sum())
    data['Distribution Bad'] = data['Bad'] / data['Bad'].sum()
    data['WoE'] = np.log(data['Distribution Good'] / data['Distribution Bad'])

    # A value seen on only one side of the target yields +/-inf WoE; zero it.
    data = data.replace({'WoE': {np.inf: 0, -np.inf: 0}})

    data['IV'] = data['WoE'] * (data['Distribution Good'] - data['Distribution Bad'])

    data = data.sort_values(by=['Variable', 'Value'], ascending=[True, True])
    data.index = range(len(data.index))

    if pr:
        print(data)
        print('IV = ', data['IV'].sum())

    iv = data['IV'].sum()

    return iv, data
In [43]:
# IV for every candidate feature (the target is the last column — excluded).
features = app3.columns.tolist()[:-1]

iv_list = []
for feature in features:
    iv, data = calc_iv(app3, feature, 'target')
    iv_list.append(round(iv, 4))

# np.column_stack keeps the original string-typed 'iv' column.
df2 = pd.DataFrame(np.column_stack([features, iv_list]),
                   columns=['Feature', 'iv'])
df2
Out[43]:
Feature iv
0 ID 0.0
1 Gender 0.0141
2 Own_Car 0.0006
3 Own_Realty 0.0297
4 Income 0.1884
5 Own_Mobile 0.0
6 Own_Work_Phone 0.0023
7 Own_Phone 0.0006
8 Own_Email 0.0001
9 Family_Member_Count 0.0155
10 begin_month 0.3308
11 Is_Working 0.0116
12 In_Relationship 0.0155
13 Household_Size 0.0252
14 Age 0.138
15 Experience 0.0765
16 Income_Type_Pensioner 0.0118
17 Income_Type_Student 0.0
18 Income_Type_Working 0.0116
19 Education_Academic degree 0.0
20 Education_Higher education 0.0028
21 Education_secondary 0.0026
22 Family_Status_Married 0.0155
23 Family_Status_Single 0.0155
24 Housing_Type_House / apartment 0.0016
25 Housing_Type_With parents 0.0016
26 paid_off 0.1327
27 #_of_pastdues 1.1703
28 no_loan 0.0981
In [44]:
# Split predictors from the target. Select the target by name rather than by
# position: the original's app3.iloc[:, -1:] silently relied on 'target'
# being the last column.
x = app3.loc[:, app3.columns != 'target']
y = app3[['target']]

X = x  # placeholder; replaced by the scaled frame in the next cell

Feature Scaling

In [45]:
from sklearn.preprocessing import StandardScaler

# Standardize every predictor to zero mean / unit variance.
scaler = StandardScaler()
scaler.fit(x)

# Fixed: the original passed columns=[x.columns], which wraps the Index in a
# list and yields a one-level MultiIndex of tuples; pass the Index directly.
X = pd.DataFrame(scaler.transform(x), columns=x.columns, index=x.index)

SMOTE (Synthetic Minority Oversampling Technique)

Imbalanced classification involves developing predictive models on classification datasets that have a severe class imbalance.

The challenge of working with imbalanced datasets is that most machine learning techniques will ignore, and in turn have poor performance on, the minority class, although typically it is performance on the minority class that is most important.

One approach to addressing imbalanced datasets is to oversample the minority class. The simplest approach involves duplicating examples in the minority class, although these examples don’t add any new information to the model. Instead, new examples can be synthesized from the existing examples.

In [47]:
# Oversample the minority class with SMOTE so both target values are equally
# represented before training (target was object-typed, so cast to int first).
y = y.astype(int)

oversampler = SMOTE()
X_balance, Y_balance = oversampler.fit_resample(X, y)
X_balance = pd.DataFrame(X_balance, columns=X.columns)

Value counts are now balanced.

RFE (Recursive Feature Elimination)

Recursive Feature Elimination is effective at selecting the features (columns) in a training dataset that are more or most relevant in predicting the target variable.

In [52]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Rank features by recursively eliminating the weakest logistic-regression
# coefficients; rank 1 == kept in the selected subset.
cols = [name for name in app3.columns if name != 'target']
model = LogisticRegression(solver='liblinear')
rfe = RFE(model)
fit = rfe.fit(X_balance, Y_balance)

rfe_features = pd.DataFrame({
    "Feature": cols,
    "Support_LogisticRegression": fit.support_,
    "Feature_Rank_logisticRegression": fit.ranking_,
})
rfe_features
Out[52]:
Feature Support_LogisticRegression Feature_Rank_logisticRegression
0 ID False 6
1 Gender True 1
2 Own_Car True 1
3 Own_Realty True 1
4 Income False 9
5 Own_Mobile False 16
6 Own_Work_Phone False 5
7 Own_Phone True 1
8 Own_Email True 1
9 Family_Member_Count True 1
10 begin_month True 1
11 Is_Working False 10
12 In_Relationship False 11
13 Household_Size True 1
14 Age False 2
15 Experience True 1
16 Income_Type_Pensioner False 3
17 Income_Type_Student False 4
18 Income_Type_Working False 12
19 Education_Academic degree True 1
20 Education_Higher education False 8
21 Education_secondary False 13
22 Family_Status_Married False 14
23 Family_Status_Single False 15
24 Housing_Type_House / apartment True 1
25 Housing_Type_With parents False 7
26 paid_off True 1
27 #_of_pastdues True 1
28 no_loan True 1

Extra Tree Classifier

This class implements a meta estimator that fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.

In [54]:
from sklearn.ensemble import ExtraTreesClassifier

# Impurity-based feature importances from a small forest of extremely
# randomized trees. random_state pins the importances so the notebook is
# reproducible under Restart & Run All (the original was unseeded).
model = ExtraTreesClassifier(n_estimators=10, random_state=42)
model.fit(X_balance, Y_balance)
feature_importances = pd.DataFrame({
    "Feature": cols,
    "Feature_Importance_ExtratreeClassifier": model.feature_importances_,
})

Results from Feature Selection Methods

In [56]:
# Combine IV scores, tree importances, and RFE ranks into one comparison table.
feature_selection_df = (
    df2
    .merge(feature_importances, on=["Feature"])
    .merge(rfe_features, on=["Feature"])
)
feature_selection_df
Out[56]:
Feature iv Feature_Importance_ExtratreeClassifier Support_LogisticRegression Feature_Rank_logisticRegression
0 ID 0.0 0.064964 False 6
1 Gender 0.0141 0.045177 True 1
2 Own_Car 0.0006 0.049761 True 1
3 Own_Realty 0.0297 0.052699 True 1
4 Income 0.1884 0.054067 False 9
5 Own_Mobile 0.0 0.000000 False 16
6 Own_Work_Phone 0.0023 0.032620 False 5
7 Own_Phone 0.0006 0.045634 True 1
8 Own_Email 0.0001 0.017642 True 1
9 Family_Member_Count 0.0155 0.043479 True 1
10 begin_month 0.3308 0.065054 True 1
11 Is_Working 0.0116 0.005565 False 10
12 In_Relationship 0.0155 0.003164 False 11
13 Household_Size 0.0252 0.037530 True 1
14 Age 0.138 0.055782 False 2
15 Experience 0.0765 0.055063 True 1
16 Income_Type_Pensioner 0.0118 0.006614 False 3
17 Income_Type_Student 0.0 0.000003 False 4
18 Income_Type_Working 0.0116 0.006065 False 12
19 Education_Academic degree 0.0 0.000068 True 1
20 Education_Higher education 0.0028 0.020822 False 8
21 Education_secondary 0.0026 0.019640 False 13
22 Family_Status_Married 0.0155 0.005269 False 14
23 Family_Status_Single 0.0155 0.002369 False 15
24 Housing_Type_House / apartment 0.0016 0.005644 True 1
25 Housing_Type_With parents 0.0016 0.006984 False 7
26 paid_off 0.1327 0.050119 True 1
27 #_of_pastdues 1.1703 0.199336 True 1
28 no_loan 0.0981 0.048867 True 1

The selected features below were chosen from the results above.

In [57]:
# Final feature subset, chosen from the IV / importance / RFE comparison above.
selected_features = [
    "paid_off",
    "begin_month",
    "#_of_pastdues",
    "no_loan",
    "Income",
    "Experience",
    "Education_Higher education",
    "Education_secondary",
    "Own_Realty",
    "Family_Status_Single",
    "Family_Member_Count",
    "Is_Working",
    "Own_Car",
    "Age",
]

X_balance = X_balance[selected_features]
In [58]:
from sklearn.model_selection import train_test_split

# 70/30 train/test split; random_state fixed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X_balance, Y_balance, random_state=100, test_size=0.3
)
print(X_train.shape)
(49061, 14)

The Model

In [59]:
# Train each candidate classifier and collect accuracy / precision / recall /
# F1 plus the ROC inputs (fpr, tpr, auc) for later plotting.
classifiers = {
    "LogisticRegression" : LogisticRegression(),
    "KNeighbors" : KNeighborsClassifier(),
    "SVC" : SVC(C = 0.8,kernel='linear',probability=True),
    "DecisionTree" : DecisionTreeClassifier(),
    "RandomForest" : RandomForestClassifier(n_estimators=250,max_depth=12,min_samples_leaf=16),
    "XGBoost" : XGBClassifier(max_depth=12,
                              n_estimators=250,
                              min_child_weight=8, 
                              subsample=0.8, 
                              learning_rate =0.02,    
                              seed=42),
    "CatBoost" : CatBoostClassifier(iterations=250,
                           learning_rate=0.2,
                           od_type='Iter',
                           verbose=25,
                           depth=16,
                           random_seed=42)
}

# Accumulate rows in a list and build the frame once at the end:
# DataFrame.append inside a loop is quadratic and was removed in pandas 2.0.
# (Also fixed: the misspelled 'presicion' column and an unused conf_matrix.)
rows = []
for key, classifier in classifiers.items():
    classifier.fit(X_train, y_train)
    y_predict = classifier.predict(X_test)

    # Probability of the positive class, for the ROC curve.
    yproba = classifier.predict_proba(X_test)[:, 1]

    fpr, tpr, _ = roc_curve(y_test, yproba)
    auc = roc_auc_score(y_test, yproba)

    rows.append({'classifiers': key,
                 'accuracy': accuracy_score(y_test, y_predict),
                 'precision': precision_score(y_test, y_predict, average='weighted'),
                 'recall': recall_score(y_test, y_predict, average='weighted'),
                 'f1_score': f1_score(y_test, y_predict, average='weighted'),
                 'fpr': fpr,
                 'tpr': tpr,
                 'auc': auc})

result_table = pd.DataFrame(
    rows, columns=['classifiers','accuracy','precision','recall','f1_score','fpr','tpr','auc']
).set_index('classifiers')
[17:35:55] WARNING: /opt/concourse/worker/volumes/live/7a2b9f41-3287-451b-6691-43e9a6c0910f/volume/xgboost-split_1619728204606/work/src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
0:	learn: 0.4816945	total: 3.26s	remaining: 13m 31s
25:	learn: 0.0351963	total: 1m 17s	remaining: 11m 5s
50:	learn: 0.0145148	total: 2m 22s	remaining: 9m 15s
75:	learn: 0.0078211	total: 3m 29s	remaining: 8m
100:	learn: 0.0049476	total: 4m 36s	remaining: 6m 47s
125:	learn: 0.0035744	total: 5m 44s	remaining: 5m 39s
150:	learn: 0.0028916	total: 7m 16s	remaining: 4m 46s
175:	learn: 0.0028245	total: 8m 18s	remaining: 3m 29s
200:	learn: 0.0028241	total: 9m 19s	remaining: 2m 16s
225:	learn: 0.0027386	total: 10m 22s	remaining: 1m 6s
249:	learn: 0.0026875	total: 11m 23s	remaining: 0us

Results

In [60]:
# One confusion matrix per fitted classifier, on the held-out test set.
# NOTE(review): plot_confusion_matrix was deprecated in scikit-learn 1.0 and
# removed in 1.2; newer environments need ConfusionMatrixDisplay.from_estimator.
fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(15,10))

for cls, ax in zip(list(classifiers.values()), axes.flatten()):
    plot_confusion_matrix(cls, 
                          X_test, 
                          y_test, 
                          ax=ax, 
                          cmap='Blues')
    # Label each panel with the model's class name.
    ax.title.set_text(type(cls).__name__)
plt.tight_layout()  
plt.show()
In [61]:
# Overlay ROC curves for all classifiers; the diagonal is the random baseline.
fig = plt.figure(figsize=(8, 6))

for name in result_table.index:
    plt.plot(result_table.loc[name]['fpr'],
             result_table.loc[name]['tpr'],
             label="{}, AUC={:.3f}".format(name, result_table.loc[name]['auc']))

plt.plot([0, 1], [0, 1], color='orange', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=15)  # fixed typo: was "Flase"

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size':13}, loc='lower right')

plt.show()

From the ROC curve above, we can see that the model created using the CatBoost classifier returns the highest AUC. Its AUC is close to 1, which indicates a very good measure of class separability.

In [62]:
# Final metric comparison (accuracy, precision, recall, F1) across all models.
result_table.iloc[:,:4]
Out[62]:
accuracy presicion recall f1_score
classifiers
LogisticRegression 0.758977 0.759348 0.758977 0.758940
KNeighbors 0.953821 0.957349 0.953821 0.953714
SVC 0.759452 0.759600 0.759452 0.759447
DecisionTree 0.967994 0.968063 0.967994 0.967991
RandomForest 0.953251 0.954331 0.953251 0.953213
XGBoost 0.985780 0.985810 0.985780 0.985780
CatBoost 0.994055 0.994058 0.994055 0.994055

The results in the table above agree with the ROC curve: the model created using the CatBoost classifier achieves the highest accuracy.